In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 

import warnings
warnings.filterwarnings("ignore")

Read the dataset¶

In [3]:
data = pd.read_csv("../Spotify Music Dataset/data/data.csv")
genre_data = pd.read_csv('../Spotify Music Dataset/data_by_genres.csv')
year_data = pd.read_csv('../Spotify Music Dataset/data_by_year.csv')
artist_data = pd.read_csv('../Spotify Music Dataset/data_by_artist/data_by_artist.csv')
In [4]:
data.head(5)
Out[4]:
valence year acousticness artists danceability duration_ms energy explicit id instrumentalness key liveness loudness mode name popularity release_date speechiness tempo
0 0.0594 1921 0.982 ['Sergei Rachmaninoff', 'James Levine', 'Berli... 0.279 831667 0.211 0 4BJqT0PrAfrxzMOxytFOIz 0.878000 10 0.665 -20.096 1 Piano Concerto No. 3 in D Minor, Op. 30: III. ... 4 1921 0.0366 80.954
1 0.9630 1921 0.732 ['Dennis Day'] 0.819 180533 0.341 0 7xPhfUan2yNtyFG0cUWkt8 0.000000 7 0.160 -12.441 1 Clancy Lowered the Boom 5 1921 0.4150 60.936
2 0.0394 1921 0.961 ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi... 0.328 500062 0.166 0 1o6I8BglA6ylDMrIELygv1 0.913000 3 0.101 -14.850 1 Gati Bali 5 1921 0.0339 110.339
3 0.1650 1921 0.967 ['Frank Parker'] 0.275 210000 0.309 0 3ftBPsC5vPBKxYSee08FDH 0.000028 5 0.381 -9.316 1 Danny Boy 3 1921 0.0354 100.109
4 0.2530 1921 0.957 ['Phil Regan'] 0.418 166693 0.193 0 4d6HGyGT8e121BsdKmw9v6 0.000002 3 0.229 -10.096 1 When Irish Eyes Are Smiling 2 1921 0.0380 101.665

The below table shows the description and distribution of data overall.¶

In [5]:
data.describe()
Out[5]:
valence year acousticness danceability duration_ms energy explicit instrumentalness key liveness loudness mode popularity speechiness tempo
count 170653.000000 170653.000000 170653.000000 170653.000000 1.706530e+05 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000
mean 0.528587 1976.787241 0.502115 0.537396 2.309483e+05 0.482389 0.084575 0.167010 5.199844 0.205839 -11.467990 0.706902 31.431794 0.098393 116.861590
std 0.263171 25.917853 0.376032 0.176138 1.261184e+05 0.267646 0.278249 0.313475 3.515094 0.174805 5.697943 0.455184 21.826615 0.162740 30.708533
min 0.000000 1921.000000 0.000000 0.000000 5.108000e+03 0.000000 0.000000 0.000000 0.000000 0.000000 -60.000000 0.000000 0.000000 0.000000 0.000000
25% 0.317000 1956.000000 0.102000 0.415000 1.698270e+05 0.255000 0.000000 0.000000 2.000000 0.098800 -14.615000 0.000000 11.000000 0.034900 93.421000
50% 0.540000 1977.000000 0.516000 0.548000 2.074670e+05 0.471000 0.000000 0.000216 5.000000 0.136000 -10.580000 1.000000 33.000000 0.045000 114.729000
75% 0.747000 1999.000000 0.893000 0.668000 2.624000e+05 0.703000 0.000000 0.102000 8.000000 0.261000 -7.183000 1.000000 48.000000 0.075600 135.537000
max 1.000000 2020.000000 0.996000 0.988000 5.403500e+06 1.000000 1.000000 1.000000 11.000000 1.000000 3.855000 1.000000 100.000000 0.970000 243.507000
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      170653 non-null  object 
 17  speechiness       170653 non-null  float64
 18  tempo             170653 non-null  float64
dtypes: float64(9), int64(6), object(4)
memory usage: 24.7+ MB
In [7]:
genre_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2973 entries, 0 to 2972
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              2973 non-null   int64  
 1   genres            2973 non-null   object 
 2   acousticness      2973 non-null   float64
 3   danceability      2973 non-null   float64
 4   duration_ms       2973 non-null   float64
 5   energy            2973 non-null   float64
 6   instrumentalness  2973 non-null   float64
 7   liveness          2973 non-null   float64
 8   loudness          2973 non-null   float64
 9   speechiness       2973 non-null   float64
 10  tempo             2973 non-null   float64
 11  valence           2973 non-null   float64
 12  popularity        2973 non-null   float64
 13  key               2973 non-null   int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 325.3+ KB
In [8]:
year_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              100 non-null    int64  
 1   year              100 non-null    int64  
 2   acousticness      100 non-null    float64
 3   danceability      100 non-null    float64
 4   duration_ms       100 non-null    float64
 5   energy            100 non-null    float64
 6   instrumentalness  100 non-null    float64
 7   liveness          100 non-null    float64
 8   loudness          100 non-null    float64
 9   speechiness       100 non-null    float64
 10  tempo             100 non-null    float64
 11  valence           100 non-null    float64
 12  popularity        100 non-null    float64
 13  key               100 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 11.1 KB

Box plot of various music features to see the outliers and range of values¶

In [9]:
plt.figure(figsize=(14,6))
boxplot = data.boxplot(column=['valence','acousticness','danceability','energy','explicit','instrumentalness','liveness','speechiness'])
plt.show()

Correlation between the various music features of dataframe¶

In [10]:
# Compute the correlation matrix
corr = data.corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(7, 9)) # Set up the matplotlib figure
cmap = sns.diverging_palette(230, 20, as_cmap=True) # Generate a custom diverging colormap

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

We'll keep the target as 'popularity' for the future analysis.¶

Now, let's do the Feature Correlation by considering a few features using the yellowbrick package¶

In [11]:
!pip install yellowbrick
Requirement already satisfied: yellowbrick in c:\users\prakh\anaconda3\lib\site-packages (1.5)
Requirement already satisfied: numpy>=1.16.0 in c:\users\prakh\anaconda3\lib\site-packages (from yellowbrick) (1.23.5)
Requirement already satisfied: cycler>=0.10.0 in c:\users\prakh\anaconda3\lib\site-packages (from yellowbrick) (0.11.0)
Requirement already satisfied: scikit-learn>=1.0.0 in c:\users\prakh\anaconda3\lib\site-packages (from yellowbrick) (1.2.1)
Requirement already satisfied: scipy>=1.0.0 in c:\users\prakh\anaconda3\lib\site-packages (from yellowbrick) (1.10.0)
Requirement already satisfied: matplotlib!=3.0.0,>=2.0.2 in c:\users\prakh\anaconda3\lib\site-packages (from yellowbrick) (3.7.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.4.4)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (3.0.9)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (4.25.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.0.5)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.8.2)
Requirement already satisfied: pillow>=6.2.0 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (9.4.0)
Requirement already satisfied: packaging>=20.0 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (22.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\prakh\anaconda3\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (2.2.0)
Requirement already satisfied: joblib>=1.1.1 in c:\users\prakh\anaconda3\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (1.1.1)
Requirement already satisfied: six>=1.5 in c:\users\prakh\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.16.0)
In [12]:
from yellowbrick.target import FeatureCorrelation

feature_names = ['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
                 'loudness', 'speechiness', 'tempo', 'valence', 'duration_ms', 'explicit', 'key', 'mode', 'year']

X, Y = data[feature_names], data['popularity']

# Create a list of the feature names
features = np.array(feature_names)

# Instantiate the visualizer
visualizer = FeatureCorrelation(labels=features)

plt.rcParams['figure.figsize']=(10,5)
visualizer.fit(X, Y)     # Fit the data to the visualizer
visualizer.show()
Out[12]:
<Axes: title={'center': 'Features correlation with dependent variable'}, xlabel='Pearson Correlation'>

Number of songs released over the decades¶

In [13]:
def year_to_decade(year):
    period_start = int(year/10) * 10
    year_decade = '{}s'.format(period_start)
    return year_decade

data['decade'] = data['year'].apply(year_to_decade)

sns.set(rc={'figure.figsize':(11 ,6)})
sns.countplot(data=data, x='decade')
Out[13]:
<Axes: xlabel='decade', ylabel='count'>

Trend of various music features over decades¶

In [14]:
music_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'valence']
fig = px.line(year_data, x='year', y=music_features,title='Trend of various music features over decades', 
              width=900, height=500)
fig.show()

Trend of loudness in the music over decades¶

In [15]:
fig = px.line(year_data, x='year', y='loudness',title='Trend of loudness in the music over decades')
fig.show()

Trend of various music features over top 10 genres¶

In [16]:
top10_genres = genre_data.nlargest(10, 'popularity')

fig = px.bar(top10_genres, x='genres', y=['valence', 'energy', 'danceability', 'acousticness'], barmode='stack',
            title='Trend of various music features over top 10 genres', width=900, height=650)
fig.show()

Conclusion :

a. Acousticness and Energy has a high correlation compared to other music features
b. The majority of the songs fall within the time range of 1950s to 2010s.
c. The level of energy in songs has progressively risen over time, while the instrumentalness has declined.
d. The level of acousticness in songs has significantly decreased over the years, particularly since the 1960s.
e. The trend of increasing loudness in songs is unmistakable and has reached its zenith in 2020.
f. Among the top 10 genres, energy and danceability are the most prominent characteristics.
In [17]:
top10_popular_artists = artist_data.nlargest(10, 'popularity')
print('Top 10 Artists that have the most popularity:')
top10_popular_artists[['popularity','artists']].sort_values('popularity',ascending=False)
Top 10 Artists that have the most popularity:
Out[17]:
popularity artists
20966 93.0 Ritt Momney
14354 92.0 Lele Pons
15070 90.0 Los Legendarios
11764 89.0 Jerry Di
7463 88.0 Emilee
23687 88.0 Surf Mesa
28263 88.0 salem ilese
213 87.0 A7S
2343 86.0 Beltito
14378 86.0 Lenny Santos
In [18]:
top10_most_songs_artists = artist_data.nlargest(10, 'count')
print('Top 10 Artists that produced most songs:')
top10_most_songs_artists[['count','artists']].sort_values('count',ascending=False)
Top 10 Artists that produced most songs:
Out[18]:
count artists
8367 3169 Francisco Canaro
28561 2422 Эрнест Хемингуэй
28560 2136 Эрих Мария Ремарк
8434 1459 Frank Sinatra
10714 1256 Ignacio Corsini
27109 1200 Vladimir Horowitz
1682 1146 Arturo Toscanini
2707 1103 Billie Holiday
12378 1061 Johnny Cash
7426 1023 Elvis Presley

Based on the findings of the exploratory data analysis (EDA), the following conclusions can be drawn:

  1. The majority of the songs in the dataset were produced between the 1950s and 2010s.
  2. There is a clear trend of increasing energy in songs over time.
  3. The acousticness of songs has significantly decreased over the decades.
  4. Loudness has consistently increased over the decades, with a noticeable peak in 2020.
  5. Among the top 10 genres, energy and danceability are the most notable features.

Clustering Genres with K-Means¶

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np

kmeans = KMeans(n_clusters=10)
#kmeans.set_params(n__jobs=-1)  # set number of jobs for parallel computing
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', kmeans)])
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
Out[19]:
Pipeline(steps=[('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=10))])
StandardScaler()
KMeans(n_clusters=10)
In [20]:
genre_data['cluster'] = cluster_pipeline.predict(X)
genre_data
Out[20]:
mode genres acousticness danceability duration_ms energy instrumentalness liveness loudness speechiness tempo valence popularity key cluster
0 1 21st century classical 0.979333 0.162883 1.602977e+05 0.071317 0.606834 0.361600 -31.514333 0.040567 75.336500 0.103783 27.833333 6 9
1 1 432hz 0.494780 0.299333 1.048887e+06 0.450678 0.477762 0.131000 -16.854000 0.076817 120.285667 0.221750 52.500000 5 4
2 1 8-bit 0.762000 0.712000 1.151770e+05 0.818000 0.876000 0.126000 -9.180000 0.047000 133.444000 0.975000 48.000000 7 3
3 1 [] 0.651417 0.529093 2.328809e+05 0.419146 0.205309 0.218696 -12.288965 0.107872 112.857352 0.513604 20.859882 7 1
4 1 a cappella 0.676557 0.538961 1.906285e+05 0.316434 0.003003 0.172254 -12.479387 0.082851 112.110362 0.448249 45.820071 7 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2968 1 zolo 0.222625 0.547082 2.580991e+05 0.610240 0.143872 0.204206 -11.295878 0.061088 125.494919 0.596155 33.778943 9 3
2969 0 zouglou 0.161000 0.863000 2.063200e+05 0.909000 0.000000 0.108000 -5.985000 0.081300 119.038000 0.845000 58.000000 7 2
2970 1 zouk 0.263261 0.748889 3.060728e+05 0.622444 0.257227 0.089678 -10.289222 0.038778 101.965222 0.824111 46.666667 5 3
2971 0 zurich indie 0.993000 0.705667 1.984173e+05 0.172667 0.468633 0.179667 -11.453333 0.348667 91.278000 0.739000 0.000000 7 1
2972 1 zydeco 0.421038 0.629409 1.716717e+05 0.609369 0.019248 0.255877 -9.854825 0.050491 126.366087 0.808544 30.261905 7 3

2973 rows × 15 columns

In [21]:
# Visualizing the Clusters with t-SNE

from sklearn.manifold import TSNE

tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2973 samples in 0.013s...
[t-SNE] Computed neighbors for 2973 samples in 0.683s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2973
[t-SNE] Computed conditional probabilities for sample 2000 / 2973
[t-SNE] Computed conditional probabilities for sample 2973 / 2973
[t-SNE] Mean sigma: 0.777516
[t-SNE] KL divergence after 250 iterations with early exaggeration: 76.106277
[t-SNE] KL divergence after 1000 iterations: 1.391782

Clustering Songs with K-Means¶

In [22]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, 
                                   verbose=False))
                                 ], verbose=False)

X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
Out[22]:
Pipeline(steps=[('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=20, verbose=False))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=20, verbose=False))])
StandardScaler()
KMeans(n_clusters=20, verbose=False)
In [23]:
X
Out[23]:
valence year acousticness danceability duration_ms energy explicit instrumentalness key liveness loudness mode popularity speechiness tempo
0 0.0594 1921 0.98200 0.279 831667 0.211 0 0.878000 10 0.6650 -20.096 1 4 0.0366 80.954
1 0.9630 1921 0.73200 0.819 180533 0.341 0 0.000000 7 0.1600 -12.441 1 5 0.4150 60.936
2 0.0394 1921 0.96100 0.328 500062 0.166 0 0.913000 3 0.1010 -14.850 1 5 0.0339 110.339
3 0.1650 1921 0.96700 0.275 210000 0.309 0 0.000028 5 0.3810 -9.316 1 3 0.0354 100.109
4 0.2530 1921 0.95700 0.418 166693 0.193 0 0.000002 3 0.2290 -10.096 1 2 0.0380 101.665
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
170648 0.6080 2020 0.08460 0.786 301714 0.808 0 0.000289 7 0.0822 -3.702 1 72 0.0881 105.029
170649 0.7340 2020 0.20600 0.717 150654 0.753 0 0.000000 7 0.1010 -6.020 1 68 0.0605 137.936
170650 0.6370 2020 0.10100 0.634 211280 0.858 0 0.000009 4 0.2580 -2.226 0 76 0.0809 91.688
170651 0.1950 2020 0.00998 0.671 337147 0.623 1 0.000008 2 0.6430 -7.161 1 70 0.3080 75.055
170652 0.6420 2020 0.13200 0.856 189507 0.721 1 0.004710 7 0.1820 -4.928 1 74 0.1080 94.991

170653 rows × 15 columns

In [24]:
data
Out[24]:
valence year acousticness artists danceability duration_ms energy explicit id instrumentalness key liveness loudness mode name popularity release_date speechiness tempo decade
0 0.0594 1921 0.98200 ['Sergei Rachmaninoff', 'James Levine', 'Berli... 0.279 831667 0.211 0 4BJqT0PrAfrxzMOxytFOIz 0.878000 10 0.6650 -20.096 1 Piano Concerto No. 3 in D Minor, Op. 30: III. ... 4 1921 0.0366 80.954 1920s
1 0.9630 1921 0.73200 ['Dennis Day'] 0.819 180533 0.341 0 7xPhfUan2yNtyFG0cUWkt8 0.000000 7 0.1600 -12.441 1 Clancy Lowered the Boom 5 1921 0.4150 60.936 1920s
2 0.0394 1921 0.96100 ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi... 0.328 500062 0.166 0 1o6I8BglA6ylDMrIELygv1 0.913000 3 0.1010 -14.850 1 Gati Bali 5 1921 0.0339 110.339 1920s
3 0.1650 1921 0.96700 ['Frank Parker'] 0.275 210000 0.309 0 3ftBPsC5vPBKxYSee08FDH 0.000028 5 0.3810 -9.316 1 Danny Boy 3 1921 0.0354 100.109 1920s
4 0.2530 1921 0.95700 ['Phil Regan'] 0.418 166693 0.193 0 4d6HGyGT8e121BsdKmw9v6 0.000002 3 0.2290 -10.096 1 When Irish Eyes Are Smiling 2 1921 0.0380 101.665 1920s
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
170648 0.6080 2020 0.08460 ['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna... 0.786 301714 0.808 0 0KkIkfsLEJbrcIhYsCL7L5 0.000289 7 0.0822 -3.702 1 China 72 2020-05-29 0.0881 105.029 2020s
170649 0.7340 2020 0.20600 ['Ashnikko'] 0.717 150654 0.753 0 0OStKKAuXlxA0fMH54Qs6E 0.000000 7 0.1010 -6.020 1 Halloweenie III: Seven Days 68 2020-10-23 0.0605 137.936 2020s
170650 0.6370 2020 0.10100 ['MAMAMOO'] 0.634 211280 0.858 0 4BZXVFYCb76Q0Klojq4piV 0.000009 4 0.2580 -2.226 0 AYA 76 2020-11-03 0.0809 91.688 2020s
170651 0.1950 2020 0.00998 ['Eminem'] 0.671 337147 0.623 1 5SiZJoLXp3WOl3J4C8IK0d 0.000008 2 0.6430 -7.161 1 Darkness 70 2020-01-17 0.3080 75.055 2020s
170652 0.6420 2020 0.13200 ['KEVVO', 'J Balvin'] 0.856 189507 0.721 1 7HmnJHfs0BkFzX4x8j0hkl 0.004710 7 0.1820 -4.928 1 Billetes Azules (with J Balvin) 74 2020-10-16 0.1080 94.991 2020s

170653 rows × 20 columns

In [26]:
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels
In [27]:
# Visualizing the Clusters with PCA

from sklearn.decomposition import PCA

pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)
projection['title'] = data['name']
projection['cluster'] = data['cluster_label']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()

Recommender System¶

  1. After analyzing and visualizing the data, it's evident that data points for similar genres and types of songs are clustered together.
  2. This is expected since similar genres and songs share similar characteristics such as sound and time periods.
  3. This observation can be utilized to build a recommendation system that recommends songs based on nearby data points to those that a user has already listened to.
  4. To implement this, we can use Spotipy, a Python client for the Spotify Web API, to fetch data and query Spotify's catalog for songs.
  5. To use Spotipy, we will need to install it using the "pip install spotipy" command and create an app on the Spotify Developer's page to obtain the developer's Client ID and secret key.
In [28]:
!pip install spotipy
Requirement already satisfied: spotipy in c:\users\prakh\anaconda3\lib\site-packages (2.23.0)
Requirement already satisfied: six>=1.15.0 in c:\users\prakh\anaconda3\lib\site-packages (from spotipy) (1.16.0)
Requirement already satisfied: redis>=3.5.3 in c:\users\prakh\anaconda3\lib\site-packages (from spotipy) (4.5.4)
Requirement already satisfied: urllib3>=1.26.0 in c:\users\prakh\anaconda3\lib\site-packages (from spotipy) (1.26.14)
Requirement already satisfied: requests>=2.25.0 in c:\users\prakh\anaconda3\lib\site-packages (from spotipy) (2.28.1)
Requirement already satisfied: async-timeout>=4.0.2 in c:\users\prakh\anaconda3\lib\site-packages (from redis>=3.5.3->spotipy) (4.0.2)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\prakh\anaconda3\lib\site-packages (from requests>=2.25.0->spotipy) (2.0.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\prakh\anaconda3\lib\site-packages (from requests>=2.25.0->spotipy) (2022.12.7)
Requirement already satisfied: idna<4,>=2.5 in c:\users\prakh\anaconda3\lib\site-packages (from requests>=2.25.0->spotipy) (3.4)
In [29]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id='e46c345f7f9a42f3bdf1652714ae64f4',
                                                           client_secret='d207bd6db1304f3fb8051da833a6f25e'))

def find_song(name, year):
    song_data = defaultdict()
    results = sp.search(q= 'track: {} year: {}'.format(name,year), limit=1)
    if results['tracks']['items'] == []:
        return None

    results = results['tracks']['items'][0]
    track_id = results['id']
    audio_features = sp.audio_features(track_id)[0]

    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(results['explicit'])]
    song_data['duration_ms'] = [results['duration_ms']]
    song_data['popularity'] = [results['popularity']]

    for key, value in audio_features.items():
        song_data[key] = value

    return pd.DataFrame(song_data)
In [30]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

number_cols = ['valence', 'year', 'acousticness', 'danceability', 
               'duration_ms', 'energy', 'explicit','instrumentalness', 
               'key', 'liveness', 'loudness', 'mode', 'popularity', 
               'speechiness', 'tempo']


def get_song_data(song, spotify_data):
    
    try:
        song_data = spotify_data[(spotify_data['name'] == song['name']) & (spotify_data['year'] == song['year'])].iloc[0]
        return song_data
    
    except IndexError:
        return find_song(song['name'], song['year'])
        

def get_mean_vector(song_list, spotify_data):
    
    song_vectors = []
    
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)  
    
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)


def flatten_dict_list(dict_list):
    
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict


def recommend_songs( song_list, spotify_data, n_songs=10):
    
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)
    
    song_center = get_mean_vector(song_list, spotify_data)
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    rec_songs = spotify_data.iloc[index]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs[metadata_cols].to_dict(orient='records')

Here, it gives a recommendation list of songs according to the input given below :

In [36]:
recommend_songs(
    [
        {'name': 'Love Story', 'year':2009, 'artists':'Taylor Swift'},
        {'name': 'Perfect', 'year': 2017, 'artists':'Ed Sheeran'},
        {'name': 'Truly Madly Deeply', 'year': 1997, 'artists':'Daniel Jones'},
    ], data)
Out[36]:
[{'name': 'Before You Go', 'year': 2019, 'artists': "['Lewis Capaldi']"},
 {'name': 'Before You Go', 'year': 2019, 'artists': "['Lewis Capaldi']"},
 {'name': 'Used to Be', 'year': 2018, 'artists': "['AJ Mitchell']"},
 {'name': 'mirrorball', 'year': 2020, 'artists': "['Taylor Swift']"},
 {'name': 'Oh Ana', 'year': 2007, 'artists': "['Mother Mother']"},
 {'name': 'The Village', 'year': 2017, 'artists': "['Wrabel']"},
 {'name': 'Sweet Night', 'year': 2020, 'artists': "['V']"},
 {'name': 'The Only Boy Awake', 'year': 2016, 'artists': "['Meadows']"},
 {'name': 'Like A Cowboy', 'year': 2020, 'artists': "['Parker McCollum']"},
 {'name': 'Someone You Loved', 'year': 2019, 'artists': "['Lewis Capaldi']"}]
In [ ]: